This script takes the raw data downloaded from Crimson Hexagon and cleans it up for analysis. At the bottom of the script there are a few exploratory maps.
knitr::opts_chunk$set(message = F, warning = F)
library(tidyverse)
library(jsonlite)
library(ggmap)
library(leaflet)
library(sf)
library(readxl)
library(reticulate)
library(RColorBrewer)
library(kableExtra)
library(mapview)
Crimson Hexagon data is saved in two day bulk exports. The CH website only allows exports of 10,000 randomly selected tweets. There seemed to be between 10-15k over any 2 day period so data was exported in 2-day chunks to try and get as much data as possible. Two filters were applied to the data before downloading - the location was set to Santa Barbara (this does not mean the tweet was geotagged but that it came from the area) and that it was an Original Tweet (not a retweet).
# List all .xlsx exports. Anchor the regex so we match the extension only
# (a bare "." is a regex wildcard and ".xlsx" would also match e.g.
# "foo_xlsx_old.csv").
xl_files <- list.files("../data/daily", pattern = "\\.xlsx$", full.names = TRUE)

# Read the GUID (tweet ID) column from every Crimson Hexagon export and
# stack them into one data frame. Mapping to a list and calling
# bind_rows() once avoids the O(n^2) copying of rbind() inside a loop.
ids <- xl_files %>%
  map(function(path) {
    # first row of each CH export is a header banner, hence skip = 1
    read_excel(path, skip = 1) %>%
      select(GUID)
  }) %>%
  bind_rows()
# Split the IDs into 29 roughly equal, NON-overlapping chunks and save each
# chunk as a .txt file to be read by the python twarc library.
#
# The original seq(1, nrow(ids), length.out = 30) produced fractional row
# indices and made each chunk's last row also the next chunk's first row,
# duplicating boundary IDs across files. cut() over the row sequence gives
# a clean partition instead.
#
# NOTE: tweet IDs exceed 2^53, so as.numeric() would silently corrupt them
# (doubles cannot represent such integers exactly); write them as character.
n_chunks <- 29
chunk <- cut(seq_len(nrow(ids)), breaks = n_chunks, labels = FALSE)
for (i in seq_len(n_chunks)) {
  chunk_ids <- ids$GUID[chunk == i]
  write.table(as.character(chunk_ids),
              file = paste0("../data/twitter_ids_", i, ".txt"),
              sep = "\t", quote = FALSE,
              row.names = FALSE, col.names = FALSE)
}
Now I use the python library, twarc in my terminal to “hydrate” the data using the tweet IDs. The Crimson Hexagon data does not give us much information but the twarc library lets us use the twitter id to grab a lot more information (including coordinates for geotagged tweets).
Once this is done, all tweets are saved in a JSON file.
# Read each hydrated JSONL file produced by twarc into its own data frame.
# assign() keeps the original tweets1 ... tweets29 variable names so the
# rest of the script is unaffected, while removing 29 copy-pasted lines.
for (i in 1:29) {
  assign(paste0("tweets", i),
         stream_in(file(paste0("../data/tweets", i, ".jsonl"))))
}
create_tweet_df <- function(tweets){
  # Flatten the nested twarc JSON (some fields live under tweets$user and
  # tweets$geo) into a tibble of the columns we need, then keep only
  # geotagged tweets.
  #
  # NOTE: the original cbind() on mixed types coerced every column to
  # character, so its inner as.numeric() wrappers were no-ops at best;
  # worse, as.numeric(id_str) silently corrupts 64-bit tweet/user IDs
  # (doubles only represent integers exactly up to 2^53). IDs are
  # therefore kept as character throughout.
  tweet_df <- tibble(
    created_at      = as.character(tweets$created_at),
    tweet_id        = as.character(tweets$id_str),
    full_text       = as.character(tweets$full_text),
    user_id         = as.character(tweets$user$id_str),
    user_location   = as.character(tweets$user$location),
    # geo$coordinates is a list of c(lat, lon) vectors; as.character()
    # yields strings like "c(34.42, -119.70)" that are parsed downstream
    geo_type        = as.character(tweets$geo$type),
    geo_coordinates = as.character(tweets$geo$coordinates),
    language        = as.character(tweets$lang),
    retweet_count   = as.numeric(tweets$retweet_count),
    favorite_count  = as.numeric(tweets$favorite_count)
  )
  # keep only tweets that carry explicit point coordinates
  tweet_df %>%
    filter(!is.na(geo_type))
}
Apply function
# Apply create_tweet_df() to each hydrated batch. assign()/get() keep the
# original df1 ... df29 variable names so downstream code is unaffected,
# while collapsing 29 copy-pasted lines into one loop.
for (i in 1:29) {
  assign(paste0("df", i),
         create_tweet_df(get(paste0("tweets", i))))
}
Combine
# Stack all 29 per-batch data frames into one; mget() fetches df1 ... df29
# by name so none of them has to be listed out by hand.
all_df <- bind_rows(mget(paste0("df", 1:29)))
Remove points that fall outside our bounding box, roughly longitude -120.5 to -119.5 and latitude 33.88 to 34.6 (the filter is slightly wider than the core Santa Barbara box of c(-119.9, 34.38, -119.5, 34.48)).
# Parse coordinates out of the "c(lat, lon)" strings and keep only points
# inside the greater Santa Barbara bounding box.
tweet_data <- all_df %>%
  # strip the "c(" prefix and ")" suffix, leaving "lat, lon"
  mutate(coords = gsub("\\)|c\\(", "", geo_coordinates)) %>%
  separate(coords, c("lat", "lon"), sep = ", ") %>%
  # across() replaces the superseded mutate_at()
  mutate(across(c(lon, lat), as.numeric)) %>%
  filter(lat >= 33.88, lat <= 34.6,
         lon <= -119.5, lon >= -120.5)
write_csv(tweet_data, "../data/geotagged_sb_tweets.csv")
Turn the tweet_data data frame into a spatial (sf) object.
tweet_data <- read_csv("../data/geotagged_sb_tweets.csv")

# Promote to an sf point object; st_as_sf() can set the CRS directly
# (WGS84, EPSG:4326), so no separate st_set_crs() call is needed.
tweet_sf <- tweet_data %>%
  st_as_sf(coords = c("lon", "lat"), crs = 4326)
# Interactive clustered map of all geotagged tweets; clicking a marker
# pops up the tweet text. Saved as a standalone HTML widget.
map <- tweet_data %>%
  leaflet() %>%
  # base layer
  addProviderTiles(providers$CartoDB.Positron) %>%
  # overlay: one clustered circle marker per tweet (data is inherited
  # from the leaflet() call above, so no need to pass it again)
  addCircleMarkers(lng = ~lon, lat = ~lat, popup = ~full_text,
                   radius = 3, stroke = FALSE, fillOpacity = 0.5,
                   clusterOptions = markerClusterOptions())
mapshot(map, url = "../figs/all_tweet_map_cluster_markers.html")
map